# Packages used throughout this analysis. They are attached here so the script
# also runs in a fresh R session.
library(dplyr)
library(tidyr)
library(ggplot2)
library(GGally)
library(skimr)

# Match-level EPL data, one row per match.
# NOTE(review): absolute, machine-specific path -- consider a project-relative
# path so the script runs on other machines.
epl_data <- read.csv("/Users/ryanlee/Desktop/R/R Projects/Football/dataset/epl.csv")

# Yellow-card columns per match. Taken before the referee filter further down,
# so this still contains every referee in the file.
ycard_data <- epl_data |>
  select(HomeTeam, AwayTeam, Referee, HY, AY)

# Red-card columns per match (also unfiltered).
rcard_data <- epl_data |>
  select(HomeTeam, AwayTeam, Referee, HR, AR)

# Pairwise plots of full-time and half-time goals for home and away sides.
epl_data |>
  select(FTHG, FTAG, HTHG, HTAG) |>
  ggpairs(cardinality_threshold = 50, progress = FALSE)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).

# Number of matches officiated by each referee.
referee_counts <- table(epl_data$Referee)

# Names of the referees with 20 or more matches.
qualified_referees <- names(referee_counts)[as.vector(referee_counts >= 20)]

# Keep only the matches officiated by a qualified referee; from here on
# `epl_data` refers to this filtered data.
filtered_epl_data <- subset(epl_data, Referee %in% qualified_referees)
epl_data <- filtered_epl_data

head(epl_data)
##       Date   HomeTeam       AwayTeam FTHG FTAG FTR HTHG HTAG HTR     Referee HS
## 1 16/08/14    Arsenal Crystal Palace    2    1   H    1    1   D      J Moss 14
## 2 16/08/14  Leicester        Everton    2    2   D    1    2   A     M Jones 11
## 3 16/08/14 Man United        Swansea    1    2   A    0    1   A      M Dean 14
## 4 16/08/14        QPR           Hull    0    1   A    0    0   D    C Pawson 19
## 5 16/08/14      Stoke    Aston Villa    0    1   A    0    0   D    A Taylor 12
## 6 16/08/14  West Brom     Sunderland    2    2   D    1    1   D N Swarbrick 10
##   AS HST AST HF AF HC AC HY AY HR AR
## 1  4   6   2 13 19  9  3  2  2  0  1
## 2 13   3   3 16 10  3  6  1  1  0  0
## 3  5   5   4 14 20  4  0  2  4  0  0
## 4 11   6   4 10 10  8  9  1  2  0  0
## 5  7   2   2 14  9  2  8  0  3  0  0
## 6  7   5   2 18  9  6  3  3  1  0  0
# Column-by-column summary of the filtered match data.
epl_data |> skim()
## Data summary
## Name epl_data
## Number of rows 1857
## Number of columns 22
## _______________________
## Column type frequency:
## character 6
## numeric 16
## ________________________
## Group variables None
##
## Variable type: character
##
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## Date 0 1 6 10 0 515 0
## HomeTeam 0 1 3 14 0 29 0
## AwayTeam 0 1 3 14 0 29 0
## FTR 0 1 1 1 0 3 0
## HTR 0 1 1 1 0 3 0
## Referee 0 1 6 13 0 20 0
##
## Variable type: numeric
##
## skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
## FTHG 0 1 1.53 1.30 0 1 1 2 8 ▇▅▁▁▁
## FTAG 0 1 1.18 1.16 0 0 1 2 7 ▇▂▂▁▁
## HTHG 0 1 0.68 0.84 0 0 0 1 5 ▇▁▁▁▁
## HTAG 0 1 0.53 0.74 0 0 0 1 4 ▇▃▁▁▁
## HS 0 1 14.10 5.72 0 10 13 17 43 ▂▇▃▁▁
## AS 0 1 11.25 4.74 0 8 11 14 30 ▂▇▅▁▁
## HST 0 1 4.70 2.66 0 3 4 6 17 ▇▇▃▁▁
## AST 0 1 3.84 2.24 0 2 4 5 15 ▇▆▂▁▁
## HF 0 1 10.53 3.41 0 8 10 13 24 ▁▆▇▂▁
## AF 0 1 11.06 3.50 1 9 11 13 26 ▂▇▆▁▁
## HC 0 1 5.82 3.12 0 4 5 8 19 ▅▇▃▁▁
## AC 0 1 4.69 2.67 0 3 4 6 15 ▇▇▃▁▁
## HY 0 1 1.57 1.23 0 1 1 2 7 ▇▅▃▁▁
## AY 0 1 1.75 1.28 0 1 2 3 9 ▇▇▂▁▁
## HR 0 1 0.06 0.24 0 0 0 0 2 ▇▁▁▁▁
## AR 0 1 0.08 0.27 0 0 0 0 2 ▇▁▁▁▁
# Away goals at half time (HTAG) against away goals at full time (FTAG), with a
# trend line. HTAG only takes a handful of distinct values, so the default GAM
# smoother (10 knots) cannot be fitted; an explicit linear fit is used instead.
epl_data |>
  ggplot(aes(x = HTAG, y = FTAG)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE)

# Yellow and red cards per match (home + away) for each referee, reshaped to
# one row per referee and card colour.
ref_cards <- epl_data |>
  group_by(Referee) |>
  summarise(
    yellow = sum(HY + AY) / n(),
    red = sum(HR + AR) / n()
  ) |>
  pivot_longer(cols = c(yellow, red), names_to = "card", values_to = "freq")

ref_cards
## # A tibble: 40 × 3
##    Referee    card    freq
##    <chr>      <chr>  <dbl>
##  1 A Marriner yellow 3.27 
##  2 A Marriner red    0.110
##  3 A Taylor   yellow 3.55 
##  4 A Taylor   red    0.107
##  5 C Kavanagh yellow 3.28 
##  6 C Kavanagh red    0.1  
##  7 C Pawson   yellow 3.44 
##  8 C Pawson   red    0.195
##  9 G Scott    yellow 2.58 
## 10 G Scott    red    0.12 
## # ℹ 30 more rows
# Side-by-side bars of yellow and red cards per match for each referee.
ref_cards |>
  ggplot(aes(x = Referee, y = freq, fill = card)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("red" = "red", "yellow" = "yellow")) +
  labs(
    title = "Expected Cards per Match vs Referee",
    y = "Expected Cards per Match",
    fill = "Card"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Columns needed for the team-by-referee yellow-card breakdown.
card_data <- epl_data |>
  select(HomeTeam, AwayTeam, Referee, HY, AY)

# One row per match seen from the home side: its yellow cards and the referee.
home_cards <- card_data |>
  transmute(
    team = HomeTeam,
    opp_team = AwayTeam,
    cards = HY,
    Referee,
    location = "home"
  )

# The same matches seen from the away side.
away_cards <- card_data |>
  transmute(
    team = AwayTeam,
    opp_team = HomeTeam,
    cards = AY,
    Referee,
    location = "away"
  )

# Stack both views and total the yellow cards for every (team, referee) pair,
# discarding pairs whose total is missing.
all_cards <- rbind(home_cards, away_cards) |>
  group_by(team, Referee) |>
  summarize(total_cards = sum(cards)) |>
  ungroup() |>
  filter(!is.na(total_cards))
## `summarise()` has grouped output by 'team'. You can override using the
## `.groups` argument.
# Total yellow cards received by each team across all referees.
team_totals <- all_cards |>
  group_by(team) |>
  summarize(total_cards = sum(total_cards)) |>
  filter(!is.na(total_cards))

# For every (team, referee) pair: the share of the team's yellow cards that
# was shown by that referee.
card_percentages <- all_cards |>
  left_join(team_totals, by = "team", suffix = c("_referee", "_team")) |>
  transmute(
    team,
    Referee,
    card_percentage = total_cards_referee / total_cards_team
  )

# Tile plot of one team's referee shares; `x` selects the team.
x <- "Man United"

team_cards <- filter(card_percentages, team == x)

team_cards |>
  ggplot(aes(x = Referee, y = card_percentage, fill = card_percentage)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(
    title = paste("Home Team:", x),
    x = "Referee",
    y = "Ratio of Cards to Home Team"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Yellow cards per match for each referee, split by home and away side.
# Built from the filtered `epl_data` (qualified referees only) instead of
# `ycard_data`: that subset was taken before the referee filter, so it still
# contained referees with fewer than 20 matches.
home_cards <- epl_data |>
  transmute(team = HomeTeam, Referee, cards = HY, location = "home")

away_cards <- epl_data |>
  transmute(team = AwayTeam, Referee, cards = AY, location = "away")

# Combine the dataframes
all_cards <- rbind(home_cards, away_cards)

# Mean yellow cards per match for each (referee, side); groups with a missing
# mean are dropped. `.groups = "drop"` returns an ungrouped tibble.
team_totals <- all_cards |>
  group_by(Referee, location) |>
  summarize(total_cards = sum(cards) / n(), .groups = "drop") |>
  filter(!is.na(total_cards))
# Per-match yellow-card rate for each (referee, side), stored under the name
# `card_percentage`. The value is the mean of `total_cards` within each group;
# `.groups = "drop"` returns an ungrouped tibble without the grouping message.
card_percentages <- team_totals |>
  group_by(Referee, location) |>
  summarize(card_percentage = mean(total_cards), .groups = "drop")
# Yellow cards per match for each referee, home side next to away side.
team_totals |>
  group_by(Referee, location) |>
  ggplot(aes(x = Referee, y = total_cards, fill = location)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("home" = "blue", "away" = "red")) +
  labs(
    title = "Yellow Card Per Match vs. Referee",
    x = "Referee",
    y = "Yellow Card Per Match",
    fill = ""
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Red cards per match for each referee, split by home and away side.
# Built from the filtered `epl_data` (qualified referees only) instead of
# `rcard_data`: that subset was taken before the referee filter, so it still
# contained referees with fewer than 20 matches.
home_cards <- epl_data |>
  transmute(team = HomeTeam, Referee, cards = HR, location = "home")

away_cards <- epl_data |>
  transmute(team = AwayTeam, Referee, cards = AR, location = "away")

# Combine the dataframes
all_cards <- rbind(home_cards, away_cards)

# Mean red cards per match for each (referee, side); groups with a missing
# mean are dropped. `.groups = "drop"` returns an ungrouped tibble.
team_totals <- all_cards |>
  group_by(Referee, location) |>
  summarize(total_cards = sum(cards) / n(), .groups = "drop") |>
  filter(!is.na(total_cards))
# Per-match red-card rate for each (referee, side), stored under the name
# `card_percentage`. The value is the mean of `total_cards` within each group;
# `.groups = "drop"` returns an ungrouped tibble without the grouping message.
card_percentages <- team_totals |>
  group_by(Referee, location) |>
  summarize(card_percentage = mean(total_cards), .groups = "drop")
# Red cards per match for each referee, home side next to away side.
team_totals |>
  group_by(Referee, location) |>
  ggplot(aes(x = Referee, y = total_cards, fill = location)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("home" = "blue", "away" = "red")) +
  labs(
    title = "Red Card Per Match vs. Referee",
    x = "Referee",
    y = "Red Card Per Match",
    fill = ""
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Per-match yellow cards, fouls and yellow cards per foul (both sides
# combined), keyed by referee. One row is kept per match, so reframe() is used:
# returning several rows per group from summarise() is deprecated since
# dplyr 1.1.0. The result is an ungrouped tibble ordered by referee.
epl_data_processed_yellow <- epl_data |>
  filter(!is.na(AY), !is.na(HY), !is.na(HF), !is.na(AF)) |>
  group_by(Referee) |>
  reframe(Yellow = AY + HY, Fouls = HF + AF, Ratio = Yellow / Fouls)
# Distribution of per-match yellow cards per foul, one box per referee.
epl_data_processed_yellow |>
  ggplot(aes(x = Referee, y = Ratio, group = Referee)) +
  geom_boxplot() +
  labs(
    title = "Yellow Card Ratio vs. Referee",
    x = "Referee",
    y = "Yellow Cards per Foul",
    fill = ""
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Per-match red cards, fouls and red cards per foul (both sides combined),
# keyed by referee. One row is kept per match, so reframe() is used: returning
# several rows per group from summarise() is deprecated since dplyr 1.1.0.
# The result is an ungrouped tibble ordered by referee.
# NOTE(review): the object name (`processe`) and the `Yellow` column, which
# holds red cards here, are kept unchanged so existing references keep working.
epl_data_processe_red <- epl_data |>
  filter(!is.na(AR), !is.na(HR), !is.na(HF), !is.na(AF)) |>
  group_by(Referee) |>
  reframe(Yellow = AR + HR, Fouls = HF + AF, Ratio = Yellow / Fouls)
# Distribution of per-match red cards per foul, one box per referee.
epl_data_processe_red |>
  ggplot(aes(x = Referee, y = Ratio, group = Referee)) +
  geom_boxplot() +
  labs(
    title = "Red Card Percentage vs. Referee",
    x = "Referee",
    y = "Red Card Percentage",
    fill = ""
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Average home (HF) and away (AF) fouls per match for each referee, in long
# form: `Location` is "AF" or "HF" and `Fouls` is the per-match average.
foul_referee <- epl_data |>
  filter(!is.na(AY), !is.na(HY), !is.na(HF), !is.na(AF)) |>
  group_by(Referee) |>
  summarize(
    AF = sum(AF) / n(),
    HF = sum(HF) / n(),
    HY = sum(HY) / n(),
    AY = sum(AY) / n()
  ) |>
  pivot_longer(cols = c(AF, HF), names_to = "Location", values_to = "Fouls")

# The legend labels are tied to explicit breaks. The fill levels sort as
# "AF", "HF", so positional labels c("Home Fouls", "Away Fouls") would name
# the away series "Home Fouls" and vice versa.
ggplot(foul_referee, aes(x = Referee, y = Fouls, fill = Location)) +
  geom_col(position = "dodge") +
  labs(title = "Fouls vs. Referee", x = "Referee", y = "Fouls Per Match") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  ) +
  scale_fill_discrete(
    breaks = c("HF", "AF"),
    labels = c("Home Fouls", "Away Fouls")
  )

# Define the big 6 Premier League teams
big_6 <- c("Arsenal", "Chelsea", "Liverpool", "Man City", "Man United", "Tottenham")

# Home matches of the big 6 with the home side's yellow cards per foul.
epl_data_big_6_yellow <- epl_data |>
  filter(HomeTeam %in% big_6) |>
  mutate(Team = as.factor(HomeTeam), Ratio = HY / HF)

# The same ratio for every home team; its median is the reference line below.
epl_data_big6_yellow <- epl_data |>
  mutate(Ratio = HY / HF)

overall_median_fouls_big6_yellow <- median(epl_data_big6_yellow$Ratio, na.rm = TRUE)

# One box per big 6 team, with a dashed line at the all-team median ratio.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
ggplot(epl_data_big_6_yellow, aes(x = Team, y = Ratio, fill = Team)) +
  geom_boxplot() +
  geom_hline(
    yintercept = overall_median_fouls_big6_yellow,
    linetype = "dashed",
    color = "blue",
    linewidth = 1
  ) +
  labs(
    title = "Box Plots of Ratio of Yellow Cards to Fouls vs Home Team for the Big 6 Premier League Teams",
    x = "Home Team",
    y = "Ratio of Yellow Cards to Fouls",
    fill = "Team"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )

# Teams treated as the "bad 6" in this analysis.
bad_6 <- c("Middlesbrough", "QPR", "Fulham", "Cardiff", "Wolves", "Huddersfield")

# Home matches of the bad 6 with the home side's yellow cards per foul.
epl_data_bad_6_yellow <- epl_data |>
  filter(HomeTeam %in% bad_6) |>
  mutate(Team = as.factor(HomeTeam), Ratio = HY / HF)

# The same ratio for every home team; its median is the reference line below.
epl_data_bad6_yellow <- epl_data |>
  mutate(Ratio = HY / HF)

overall_median_fouls_bad6_yellow <- median(epl_data_bad6_yellow$Ratio, na.rm = TRUE)

# One box per bad 6 team, with a dashed line at the all-team median ratio.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
ggplot(epl_data_bad_6_yellow, aes(x = Team, y = Ratio, fill = Team)) +
  geom_boxplot() +
  geom_hline(
    yintercept = overall_median_fouls_bad6_yellow,
    linetype = "dashed",
    color = "blue",
    linewidth = 1
  ) +
  labs(
    title = "Box Plots of Ratio of Yellow Cards to Fouls vs Home Team for the Bad 6 Premier League Teams",
    x = "Home Team",
    y = "Ratio of Yellow Cards to Fouls",
    fill = "Team"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

big_6 <- c("Arsenal", "Chelsea", "Liverpool", "Man City", "Man United", "Tottenham")

# Home matches of the big 6 with the home side's red cards per foul.
epl_data_big_6_red <- epl_data |>
  filter(HomeTeam %in% big_6) |>
  mutate(Team = as.factor(HomeTeam), Ratio = HR / HF)

# The same ratio for every home team; its median is the reference line below.
epl_data_big6_red <- epl_data |>
  mutate(Ratio = HR / HF)

overall_median_fouls_big6_red <- median(epl_data_big6_red$Ratio, na.rm = TRUE)

# One box per big 6 team, with a dashed line at the all-team median ratio.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
ggplot(epl_data_big_6_red, aes(x = Team, y = Ratio, fill = Team)) +
  geom_boxplot() +
  geom_hline(
    yintercept = overall_median_fouls_big6_red,
    linetype = "dashed",
    color = "blue",
    linewidth = 1
  ) +
  labs(
    title = "Box Plots of Ratio of Red Cards to Fouls vs Home Team for the Big 6 Premier League Teams",
    x = "Home Team",
    y = "Ratio of Red Cards to Fouls",
    fill = "Team"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )

bad_6 <- c("Middlesbrough", "QPR", "Fulham", "Cardiff", "Wolves", "Huddersfield")

# Home matches of the bad 6 with the home side's red cards per foul.
# The ratio is HR / HF, matching the plot title and the big 6 version; the raw
# red-card count HR on its own is not a ratio of red cards to fouls.
epl_data_bad_6_red <- epl_data |>
  filter(HomeTeam %in% bad_6) |>
  mutate(Team = as.factor(HomeTeam), Ratio = HR / HF)

# The same ratio for every home team; its median is the reference line below.
epl_data_bad6_red <- epl_data |>
  mutate(Ratio = HR / HF)

overall_median_fouls_bad6_red <- median(epl_data_bad6_red$Ratio, na.rm = TRUE)

# One box per bad 6 team, with a dashed line at the all-team median ratio.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
ggplot(epl_data_bad_6_red, aes(x = Team, y = Ratio, fill = Team)) +
  geom_boxplot() +
  geom_hline(
    yintercept = overall_median_fouls_bad6_red,
    linetype = "dashed",
    color = "blue",
    linewidth = 1
  ) +
  labs(
    title = "Box Plots of Ratio of Red Cards to Fouls vs Home Team for the Bad 6 Premier League Teams",
    x = "Home Team",
    y = "Ratio of Red Cards to Fouls",
    fill = "Team"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )

# Stacked bars of the per-match home red-cards-per-foul ratio, by home team.
# NOTE(review): stacking sums HR / HF over a team's home matches, which does
# not obviously match the "Expected Red Cards per Match" axis label -- confirm
# the intended quantity.
epl_data |>
  mutate(red_per_foul = HR / HF) |>
  ggplot(aes(x = HomeTeam, y = red_per_foul)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Red Cards and Fouls by Home Team",
    x = "Home Team",
    y = "Expected Red Cards per Match"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

# Add fouls_per_game, yellow_per_game and red_per_game: each match's home
# fouls / yellow cards / red cards divided by the number of rows that home
# team has in the data. Summing a team's rows therefore gives its average.
epl_data <- epl_data |>
  group_by(HomeTeam) |>
  mutate(
    home_matches = n(),
    fouls_per_game = HF / home_matches,
    yellow_per_game = HY / home_matches,
    red_per_game = HR / home_matches,
    home_matches = NULL # helper column only; removed again
  ) |>
  ungroup()

# Average fouls per home match for each team, sorted from most to fewest.
# The bars stack one `fouls_per_game` value per match, so a bar's height is the
# SUM of those values. The axis is ordered by that same sum: reorder()'s
# default (the mean) would mis-rank teams with different numbers of home
# matches. The fill is a fixed colour, so no fill scale is needed.
ggplot(
  epl_data,
  aes(x = reorder(HomeTeam, -fouls_per_game, FUN = sum), y = fouls_per_game)
) +
  geom_col(fill = "blue") +
  labs(title = "Fouls by Home Team", x = "Home Team", y = "Fouls per Match") +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )

# Average yellow cards per home match for each team, sorted from most to
# fewest. The bars stack one `yellow_per_game` value per match, so a bar's
# height is the SUM of those values. The axis is ordered by that same sum:
# reorder()'s default (the mean) would mis-rank teams with different numbers
# of home matches. The fill is a fixed colour, so no fill scale is needed.
ggplot(
  epl_data,
  aes(x = reorder(HomeTeam, -yellow_per_game, FUN = sum), y = yellow_per_game)
) +
  geom_col(fill = "yellow") +
  labs(
    title = "Yellow Cards by Home Team",
    x = "Home Team",
    y = "Yellow Cards per Match"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )

# Average red cards per home match for each team, sorted from most to fewest.
# The bars stack one `red_per_game` value per match, so a bar's height is the
# SUM of those values. The axis is ordered by that same sum: reorder()'s
# default (the mean) would mis-rank teams with different numbers of home
# matches. The fill is a fixed colour, so no fill scale is needed.
ggplot(
  epl_data,
  aes(x = reorder(HomeTeam, -red_per_game, FUN = sum), y = red_per_game)
) +
  geom_col(fill = "red") +
  labs(
    title = "Red Cards by Home Team",
    x = "Home Team",
    y = "Red Cards per Match"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )

# Yellow cards per game for every (referee, team) pair, home and away combined.
card_data <- epl_data |>
  select(HomeTeam, AwayTeam, Referee, HY, AY)

# Yellow cards and match count for each team's HOME matches under each referee.
home_cards <- card_data |>
  group_by(Referee, team = HomeTeam) |>
  summarize(cards = sum(HY), totalx = n(), .groups = "drop")

# Yellow cards and match count for each team's AWAY matches under each referee.
away_cards <- card_data |>
  group_by(Referee, team = AwayTeam) |>
  summarize(cards = sum(AY), totaly = n(), .groups = "drop")

# full_join(), not left_join(): a team that met a referee only in away matches
# has no row in `home_cards` and would otherwise be dropped. Missing sides
# count as zero games / zero cards.
all_cards <- full_join(home_cards, away_cards, by = c("Referee", "team")) |>
  mutate(
    total_games = coalesce(totalx, 0L) + coalesce(totaly, 0L),
    total_cards = (coalesce(cards.x, 0) + coalesce(cards.y, 0)) / total_games
  ) |>
  select(-totalx, -totaly, -cards.x, -cards.y)

# Referees to plot come from the data being plotted. `card_percentages` was
# built in an earlier section from different data, so it is not used here.
referees <- sort(unique(all_cards$Referee))

# One bar chart per referee: average yellow cards per game for each team,
# highest first.
for (x in referees) {
  team_cards <- filter(all_cards, Referee == x)

  referee_plot <- team_cards |>
    ggplot(aes(x = reorder(team, -total_cards), y = total_cards)) +
    geom_col(fill = "yellow") +
    labs(
      title = paste("Referee:", x),
      x = "Team",
      y = "Average Yellow Cards per Game"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(face = "bold"),
      axis.text.x = element_text(angle = 90, hjust = 1)
    )

  # Plots must be printed explicitly inside a loop.
  print(referee_plot)
}

# Red cards per game for every (referee, team) pair, home and away combined.
card_data <- epl_data |>
  select(HomeTeam, AwayTeam, Referee, HR, AR)

# Red cards and match count for each team's HOME matches under each referee.
home_cards <- card_data |>
  group_by(Referee, team = HomeTeam) |>
  summarize(cards = sum(HR), totalx = n(), .groups = "drop")

# Red cards and match count for each team's AWAY matches under each referee.
away_cards <- card_data |>
  group_by(Referee, team = AwayTeam) |>
  summarize(cards = sum(AR), totaly = n(), .groups = "drop")

# full_join(), not left_join(): a team that met a referee only in away matches
# has no row in `home_cards` and would otherwise be dropped. Missing sides
# count as zero games / zero cards.
all_cards <- full_join(home_cards, away_cards, by = c("Referee", "team")) |>
  mutate(
    total_games = coalesce(totalx, 0L) + coalesce(totaly, 0L),
    total_cards = (coalesce(cards.x, 0) + coalesce(cards.y, 0)) / total_games
  ) |>
  select(-totalx, -totaly, -cards.x, -cards.y)

# Referees to plot come from the data being plotted. `card_percentages` was
# built in an earlier section from different data, so it is not used here.
referees <- sort(unique(all_cards$Referee))

# One bar chart per referee: average red cards per game for each team,
# highest first.
for (x in referees) {
  team_cards <- filter(all_cards, Referee == x)

  referee_plot <- team_cards |>
    ggplot(aes(x = reorder(team, -total_cards), y = total_cards)) +
    geom_col(fill = "red") +
    labs(
      x = "Team",
      y = "Average Red Cards per Game",
      title = paste("Referee:", x)
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

  # Plots must be printed explicitly inside a loop.
  print(referee_plot)
}

# Fouls per game for every (referee, team) pair, home and away combined.
# The `cards` / `total_cards` column names are kept from the card sections
# above, but hold foul counts here.
card_data <- epl_data |>
  select(HomeTeam, AwayTeam, Referee, HF, AF)

# Fouls and match count for each team's HOME matches under each referee.
home_cards <- card_data |>
  group_by(Referee, team = HomeTeam) |>
  summarize(cards = sum(HF), totalx = n(), .groups = "drop")

# Fouls and match count for each team's AWAY matches under each referee.
away_cards <- card_data |>
  group_by(Referee, team = AwayTeam) |>
  summarize(cards = sum(AF), totaly = n(), .groups = "drop")

# full_join(), not left_join(): a team that met a referee only in away matches
# has no row in `home_cards` and would otherwise be dropped. Missing sides
# count as zero games / zero fouls.
all_cards <- full_join(home_cards, away_cards, by = c("Referee", "team")) |>
  mutate(
    total_games = coalesce(totalx, 0L) + coalesce(totaly, 0L),
    total_cards = (coalesce(cards.x, 0) + coalesce(cards.y, 0)) / total_games
  ) |>
  select(-totalx, -totaly, -cards.x, -cards.y)

# Referees to plot come from the data being plotted. `card_percentages` was
# built in an earlier section from different data, so it is not used here.
referees <- sort(unique(all_cards$Referee))

# One bar chart per referee: average fouls per game for each team,
# highest first.
for (x in referees) {
  team_cards <- filter(all_cards, Referee == x)

  referee_plot <- team_cards |>
    ggplot(aes(x = reorder(team, -total_cards), y = total_cards)) +
    geom_col(fill = "blue") +
    labs(
      x = "Team",
      y = "Average Fouls per Game",
      title = paste("Referee:", x)
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

  # Plots must be printed explicitly inside a loop.
  print(referee_plot)
}

# Season totals of fouls, yellow cards and red cards for every team, used
# below to compare the Big 6 with the Bad 6.
big_6 <- c("Arsenal", "Chelsea", "Liverpool", "Man City", "Man United", "Tottenham")
bad_6 <- c("Middlesbrough", "QPR", "Fulham", "Cardiff", "Wolves", "Huddersfield")

# Only the team and discipline columns are needed.
card_data <- epl_data |>
  select(HomeTeam, AwayTeam, HF, AF, HY, AY, HR, AR)

# Totals accumulated in home matches.
home_cards <- card_data |>
  group_by(team = HomeTeam) |>
  summarize(
    home_fouls = sum(HF),
    home_yellow_cards = sum(HY),
    home_red_cards = sum(HR)
  )

# Totals accumulated in away matches.
away_cards <- card_data |>
  group_by(team = AwayTeam) |>
  summarize(
    away_fouls = sum(AF),
    away_yellow_cards = sum(AY),
    away_red_cards = sum(AR)
  )

# Home + away totals per team; teams come from `home_cards` (left join).
all_cards <- home_cards |>
  left_join(away_cards, by = "team") |>
  transmute(
    team,
    total_fouls = home_fouls + away_fouls,
    total_yellow_cards = home_yellow_cards + away_yellow_cards,
    total_red_cards = home_red_cards + away_red_cards
  )

# Number of home matches per team.
home_games_played <- epl_data |>
  group_by(team = HomeTeam) |>
  summarize(h_games_played = n())

# Number of away matches per team.
away_games_played <- epl_data |>
  group_by(team = AwayTeam) |>
  summarize(a_games_played = n())

# Total matches per team (home + away); teams come from the home table.
games_played <- home_games_played |>
  left_join(away_games_played, by = "team") |>
  transmute(team, total_games_played = h_games_played + a_games_played)

# Per-game averages of fouls, yellow cards and red cards for each team.
all_cards <- all_cards |>
  left_join(games_played, by = "team") |>
  transmute(
    team,
    avg_fouls = total_fouls / total_games_played,
    avg_yellow_cards = total_yellow_cards / total_games_played,
    avg_red_cards = total_red_cards / total_games_played
  )

# Split the per-team averages into the two groups being compared.
big_6_cards <- filter(all_cards, team %in% big_6)

bad_6_cards <- filter(all_cards, team %in% bad_6)

# Two-sided normal-approximation confidence interval for the mean of `data`.
#
# data:  numeric vector of observations.
# alpha: significance level; the interval has nominal coverage 1 - alpha.
#
# Returns c(lower, upper): mean(data) -/+ qnorm(1 - alpha / 2) * sd / sqrt(n).
CI <- function(data, alpha) {
  centre <- mean(data)
  half_width <- qnorm(1 - alpha / 2) * sd(data) / sqrt(length(data))
  c(centre - half_width, centre + half_width)
}

# Compare the Big 6 and Bad 6 on per-game fouls, yellow cards and red cards
# with a two-sample z-test on the per-team averages (unequal variances).

# Two-sample z summary for mean(a) - mean(b): z statistic, two-sided p-value
# and (1 - alpha) confidence interval, all based on the same standard error
# sqrt(var(a) / n_a + var(b) / n_b).
two_sample_z <- function(a, b, alpha = 0.05) {
  estimate <- mean(a) - mean(b)
  std_error <- sqrt(var(a) / length(a) + var(b) / length(b))
  z <- estimate / std_error
  list(
    z = z,
    p = 2 * pnorm(-abs(z)),
    ci = estimate + c(-1, 1) * qnorm(1 - alpha / 2) * std_error
  )
}

# Group means of the per-team averages.
big_6_fouls <- mean(big_6_cards$avg_fouls)
bad_6_fouls <- mean(bad_6_cards$avg_fouls)
big_6_yellow_cards <- mean(big_6_cards$avg_yellow_cards)
bad_6_yellow_cards <- mean(bad_6_cards$avg_yellow_cards)
big_6_red_cards <- mean(big_6_cards$avg_red_cards)
bad_6_red_cards <- mean(bad_6_cards$avg_red_cards)

fouls_test <- two_sample_z(big_6_cards$avg_fouls, bad_6_cards$avg_fouls)
yellow_test <- two_sample_z(
  big_6_cards$avg_yellow_cards,
  bad_6_cards$avg_yellow_cards
)
red_test <- two_sample_z(big_6_cards$avg_red_cards, bad_6_cards$avg_red_cards)

# z-scores for the difference in group means (Big 6 minus Bad 6).
z_fouls <- fouls_test$z
z_yellow_cards <- yellow_test$z
z_red_cards <- red_test$z

# Two-sided p-values for those differences.
p_fouls <- fouls_test$p
p_yellow_cards <- yellow_test$p
p_red_cards <- red_test$p

# 95% confidence intervals for the difference in group means. These use the
# same standard error as the z-scores. Subtracting the two team vectors
# element by element and passing the result to CI() would pair unrelated teams
# by row position and would be inconsistent with the test above.
# NOTE(review): each group holds at most six teams, so the normal
# approximation is rough; a t-based interval may be more appropriate.
big_6_bad_6_fouls_CI <- fouls_test$ci
big_6_bad_6_yellow_cards_CI <- yellow_test$ci
big_6_bad_6_red_cards_CI <- red_test$ci
# Total yellow cards against total fouls per match (both sides combined), with
# ggplot2's default smoother.
epl_data |>
  mutate(yellow = HY + AY, red = HR + AR, fouls = HF + AF) |>
  ggplot(aes(x = fouls, y = yellow)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'